// OpenTxl-C Version 11 charset
// J.R. Cordy, Jan 2023

// Copyright 2023, James R. Cordy and others

// Permission is hereby granted, free of charge, to any person obtaining a copy of this software 
// and associated documentation files (the “Software”), to deal in the Software without restriction, 
// including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 
// subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all copies 
// or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE 
// AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

// The TXL character set tables.
// Define and initialize character class tables used by the input scanner and output printer.
// Rapid character classification using array subscripting is key to fast scanning of input.
// Output spacing tables determine the default spacing between output tokens based on character adjacency.

// Modification Log

// v11.0 Initial revision, adapted from OpenTxl 11.0

// v11.1 Added NBSP (Latin-1 code 160) as space character and separator

// v11.2 Changed default â, Â and Ã, which conflict with UTF-8

// I/O, strings, memory allocation
#include "support.h"

// Check interface consistency
#include "charset.h"

// Character classes for Extended ASCII (Latin-1)
// Each property is a table subscripted by character code (cast to unsigned char).
// All of them are assigned by charset().
charset_propertyT charset_digitP, charset_alphaP, charset_alphaidP, charset_idP, charset_upperP, 
    charset_upperidP, charset_lowerP, charset_loweridP, charset_specialP, charset_repeaterP, 
    charset_optionalP, charset_separatorP, charset_spaceP, charset_metaP, charset_magicP;

// The false property, used to initialize charset properties
// (charset() sets every entry to false and copies it into the other tables)
charset_propertyT charset_falseP;

// TXL string and character literal quote escape characters
// ' ' = none; '\\' = backslash; quote = ' for character literals, " for string literals
// (see charset_setEscapeChar)
char charset_stringlitEscapeChar;
char charset_charlitEscapeChar;

// TXL output spacing tables
// charset_spaceBeforeP is false for the characters that, by default, suppress a space
// before a token beginning with them; charset_spaceAfterP likewise for the last character
charset_propertyT charset_spaceBeforeP;
charset_propertyT charset_spaceAfterP;

// Case maps, subscripted by character code:
// charset_uppercase[c] is the upper case equivalent of c (lower-to-upper),
// charset_lowercase[c] is the lower case equivalent of c (upper-to-lower);
// characters without a case equivalent map to themselves
charset_mapT charset_uppercase;
charset_mapT charset_lowercase;

// Charset property test for entire strings.
// Returns true when every character of tokenText has the property propertyP
// (so an empty string yields true), and false as soon as one character does not.
bool charset_uniformlyP (const longstring tokenText, const charset_propertyT propertyP)
{
    // Measure the text once instead of on every loop test
    const int length = lstringlen (tokenText);

    for (int i = 0; i < length; i++) {
        // Subscript as unsigned char so that codes above 127 index the table correctly
        if (!propertyP[(unsigned char) tokenText[i]]) {
            return (false);
        }
    }
    return (true);
}

// Modify charset properties
// Add (setting true) or remove (setting false) the character c as an identifier character.
// The same setting is applied to all four identifier classes and to the
// space-after output spacing entry for c.  The null character is not allowed.
void charset_addIdChar (const char c, const bool setting)
{
    // Table subscript for c, computed once
    const unsigned char code = (unsigned char) c;

    assert (c != 0);

    charset_idP[code] = setting;
    charset_alphaidP[code] = setting;
    charset_upperidP[code] = setting;
    charset_loweridP[code] = setting;
    charset_spaceAfterP[code] = setting;
}

// Add (setting true) or remove (setting false) the character c as a white space character.
void charset_addSpaceChar (const char c, const bool setting)
{
    const unsigned char code = (unsigned char) c;
    charset_spaceP[code] = setting;
}

// Set or clear the quote escape character for string and character literals.
// With setting false, both escape characters become ' ' (none).
// With setting true, either quote character selects quote doubling
// (" for string literals, ' for character literals); any other c is used for both.
void charset_setEscapeChar (const char c, const bool setting)
{
    if (!setting) {
        // No escape character
        charset_stringlitEscapeChar = ' ';
        charset_charlitEscapeChar = ' ';
        return;
    }

    // Either quote means each kind of literal escapes with its own quote
    const bool quoteEscape = (c == '\'') || (c == '"');

    charset_stringlitEscapeChar = quoteEscape ? '"' : c;
    charset_charlitEscapeChar = quoteEscape ? '\'' : c;
}

// XML output encoding of special characters.
// Writes ls to the file for outstream one character at a time, replacing
// & < > " ' by their XML entities and writing every other character unchanged.
void charset_putXmlCode (const int outstream, const longstring ls)
{
    // Measure the text once instead of on every loop test
    const int length = lstringlen (ls);

    for (int i = 1; i <= length; i++) {
        // The i'th character of ls, as a string of its own
        string lsi;
        substring (lsi, ls, i, i);

        // Text to write for this character: the character itself unless it needs encoding
        const char *encoded = lsi;
        switch (stringchar (lsi, 1)) {
            case '&':
                encoded = "&amp;";
                break;
            case '<':
                encoded = "&lt;";
                break;
            case '>':
                encoded = "&gt;";
                break;
            case '"':
                encoded = "&quot;";
                break;
            case '\'':
                encoded = "&apos;";
                break;
            default :
                break;
        }
        fprintf (tffile (outstream), "%s", encoded);
    }
}

// Initialization of the property tables.
// Assigns every character class table, the literal escape characters, the output
// spacing tables and the case maps defined in this file.  Codes above 127 are
// classified only when LATIN1 is defined; when UNICODE is defined as well, codes
// 194, 195 and 226 are left out of the letter classes.
void charset (void) {
    // The false property, to initialize charset properties
    for (int c = 0; c <= 255; c++) {
        charset_falseP[(unsigned char) c] = false;
    }

    // Digits
    arrayassign (charset_digitP, charset_falseP);
    for (int c = '0'; c <= '9'; c++) {
        charset_digitP[(unsigned char) c] = true;
    }

    // Upper case letters
    arrayassign (charset_upperP, charset_falseP);
    // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
    for (int c = 'A'; c <= 'Z'; c++) {
        charset_upperP[(unsigned char) c] = true;
    }
#ifdef LATIN1
    // À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö
    for (int c = 192; c <= 214; c++) {
        charset_upperP[(unsigned char) c] = true;
    }
    #ifdef UNICODE
        // Latin-1 Â and Ã conflict with UTF-8
        charset_upperP [(unsigned char) 194] = false;
        charset_upperP [(unsigned char) 195] = false;
    #endif
    // Ø Ù Ú Û Ü Ý Þ
    for (int c = 216; c <= 222; c++) {
        charset_upperP[(unsigned char) c] = true;
    }
    // Š, Œ, Ž, Ÿ
    charset_upperP[(unsigned char) 138] = true;
    charset_upperP[(unsigned char) 140] = true;
    charset_upperP[(unsigned char) 142] = true;
    charset_upperP[(unsigned char) 159] = true;
    // ß is both
    charset_upperP[(unsigned char) 223] = true;
#endif

    // Lower case letters
    arrayassign (charset_lowerP, charset_falseP);
    // a b c d e f g h i j k l m n o p q r s t u v w x y z
    for (int c = 'a'; c <= 'z'; c++) {
        charset_lowerP[(unsigned char) c] = true;
    }
#ifdef LATIN1
    // à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö
    for (int c = 224; c <= 246; c++) {
        charset_lowerP[(unsigned char) c] = true;
    }
    #ifdef UNICODE
        // Latin-1 â conflicts with UTF-8
        charset_lowerP [(unsigned char) 226] = false;
    #endif
    // ø ù ú û ü ý þ
    for (int c = 248; c <= 254; c++) {
        charset_lowerP[(unsigned char) c] = true;
    }
    // š, œ, ž, ÿ
    charset_lowerP[(unsigned char) 154] = true;
    charset_lowerP[(unsigned char) 156] = true;
    charset_lowerP[(unsigned char) 158] = true;
    charset_lowerP[(unsigned char) 255] = true;
    // ß is both
    charset_lowerP[(unsigned char) 223] = true;
#endif

    // Alphabetics = Upper case + Lower case
    arrayassign (charset_alphaP, charset_falseP);
    for (int c = 0; c <= 255; c++) {
        charset_alphaP[(unsigned char) c] = (charset_upperP[(unsigned char) c]) || (charset_lowerP[(unsigned char) c]);
    }

    // Alphabetic identifiers = Alphabetics + underscore
    arrayassign (charset_alphaidP, charset_alphaP);
    charset_alphaidP['_'] = true;

    // Identifiers = Alphabetic identifiers + Digits
    for (int c = 0; c <= 255; c++) {
        charset_idP[(unsigned char) c] = (charset_alphaidP[(unsigned char) c]) || (charset_digitP[(unsigned char) c]);
    }

    // Upper case identifiers = Upper case letters + Digits + underscore
    for (int c = 0; c <= 255; c++) {
        charset_upperidP[(unsigned char) c] = (charset_upperP[(unsigned char) c]) || (charset_digitP[(unsigned char) c]);
    }
    charset_upperidP['_'] = true;

    // Lower case identifiers = Lower case letters + Digits + underscore
    for (int c = 0; c <= 255; c++) {
        charset_loweridP[(unsigned char) c] = (charset_lowerP[(unsigned char) c]) || (charset_digitP[(unsigned char) c]);
    }
    charset_loweridP['_'] = true;

    // Special characters
    arrayassign (charset_specialP, charset_falseP);
    // ! " # $ % & ' () * + , - .  /
    for (int c = '!'; c <= '/'; c++) {
        charset_specialP[(unsigned char) c] = true;
    }
    // except quotes and parens
    charset_specialP['"'] = false;
    charset_specialP['\''] = false;
    charset_specialP['('] = false;
    charset_specialP[')'] = false;
    // : ; < = > ?  @
    for (int c = ':'; c <= '@'; c++) {
        charset_specialP[(unsigned char) c] = true;
    }
    // [ \ ] ^ _ `
    for (int c = '['; c <= '`'; c++) {
        charset_specialP[(unsigned char) c] = true;
    }
    // except brackets and backslash
    charset_specialP['['] = false;
    charset_specialP['\\'] = false;
    charset_specialP[']'] = false;
    // { | } ~
    for (int c = '{'; c <= '~'; c++) {
        charset_specialP[(unsigned char) c] = true;
    }
#ifdef LATIN1
    // € ‚ ƒ „ … † ‡ ˆ ‰
    for (int c = 128; c <= 137; c++) {
        charset_specialP[(unsigned char) c] = true;
    }
    // ‹
    charset_specialP[(unsigned char) 139] = true;
    // ‘ ’ “ ” • – — ˜ ™
    for (int c = 145; c <= 153; c++) {
        charset_specialP[(unsigned char) c] = true;
    }
    // ›
    charset_specialP[(unsigned char) 155] = true;
    // ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ­ ® ¯ ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿
    for (int c = 161; c <= 191; c++) {
        charset_specialP[(unsigned char) c] = true;
    }
    // ×
    charset_specialP[(unsigned char) 215] = true;
    // ÷
    charset_specialP[(unsigned char) 247] = true;
#endif

    // Separators
    arrayassign (charset_separatorP, charset_falseP);
    charset_separatorP['\0'] = true;
    charset_separatorP['\t'] = true;
    charset_separatorP['\n'] = true;
    charset_separatorP['\f'] = true;
    charset_separatorP['\r'] = true;
    charset_separatorP[' '] = true;
#ifdef LATIN1
    // NBSP
    charset_separatorP[(unsigned char) 160] = true;
#endif

    // White space
    arrayassign (charset_spaceP, charset_falseP);
    charset_spaceP['\t'] = true;
    charset_spaceP['\n'] = true;
    charset_spaceP['\f'] = true;
    charset_spaceP['\r'] = true;
    charset_spaceP[' '] = true;
#ifdef LATIN1
    // NBSP
    charset_spaceP[(unsigned char) 160] = true;
#endif

    // TXL token pattern metacharacters

    // TXL token pattern repeaters
    arrayassign (charset_repeaterP, charset_falseP);
    charset_repeaterP['*'] = true;
    charset_repeaterP['+'] = true;

    // TXL token pattern optionals
    arrayassign (charset_optionalP, charset_falseP);
    charset_optionalP['*'] = true;
    charset_optionalP['?'] = true;

    // TXL token pattern metacharacters
    arrayassign (charset_metaP, charset_falseP);
    charset_metaP['#'] = true;
    charset_metaP['('] = true;
    charset_metaP[')'] = true;
    charset_metaP['*'] = true;
    charset_metaP['+'] = true;
    charset_metaP['?'] = true;
    charset_metaP['['] = true;
    charset_metaP['\\'] = true;
    charset_metaP[']'] = true;

    // TXL token pattern metacharacter magic codes
    arrayassign (charset_magicP, charset_falseP);
    charset_magicP['\0'] = true;

    // TXL string and character literal quote escape characters
    // ' ' = none; '\\' = backslash; '\'' or '"' = ' for charlits, " for stringlits
    charset_stringlitEscapeChar = '\\';         // default \ to match default stringlit pattern
    charset_charlitEscapeChar = '\\';           // default \ to match default charlit pattern

    // TXL output spacing tables
    for (int c = 0; c <= 255; c++) {
        charset_spaceBeforeP[(unsigned char) c] = true;
        charset_spaceAfterP[(unsigned char) c] = true;
    }

    // By default, we always space before any token except those that begin with:
    charset_spaceBeforeP['\t'] = false;
    charset_spaceBeforeP['\n'] = false;
    charset_spaceBeforeP['\r'] = false;
    charset_spaceBeforeP[' '] = false;
    charset_spaceBeforeP[')'] = false;
    charset_spaceBeforeP[','] = false;
    charset_spaceBeforeP[';'] = false;
    charset_spaceBeforeP['.'] = false;
    charset_spaceBeforeP[']'] = false;
    charset_spaceBeforeP['}'] = false;
#ifdef LATIN1
    // NBSP
    charset_spaceBeforeP[(unsigned char) 160] = false;
#endif

    // By default, we always space after any token except those that end with:
    charset_spaceAfterP['\t'] = false;
    charset_spaceAfterP['\n'] = false;
    charset_spaceAfterP['\r'] = false;
    charset_spaceAfterP[' '] = false;
    charset_spaceAfterP['('] = false;
    charset_spaceAfterP['.'] = false;
    charset_spaceAfterP['['] = false;
    charset_spaceAfterP['{'] = false;
#ifdef LATIN1
    // NBSP is white space, so like the other white space characters it suppresses
    // the space after as well as the space before
    charset_spaceAfterP[(unsigned char) 160] = false;
#endif

    // Upper-to-lower and lower-to-upper case maps
    // Every character maps to itself unless it has a case equivalent
    for (int c = 0; c <= 255; c++) {
        charset_uppercase[(unsigned char) c] = c;
        charset_lowercase[(unsigned char) c] = c;
    }
    // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
    for (int c = 'A'; c <= 'Z'; c++) {
        charset_lowercase[(unsigned char) c] = c - 'A' + 'a';
    }
    // a b c d e f g h i j k l m n o p q r s t u v w x y z
    for (int c = 'a'; c <= 'z'; c++) {
        charset_uppercase[(unsigned char) c] = c - 'a' + 'A';
    }

#ifdef LATIN1
    // The Latin-1 letters are addressed by numeric code rather than as character literals
    // (originally: "Have to fool T+, who thinks these are illegal characters")
    // NOTE(review): when UNICODE is defined, codes 194, 195 and 226 are excluded from the
    // letter classes above but are still translated by these maps - confirm that is intended
    int uc, lc;
    // À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö
    // à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö
    uc = 192; lc = 224;
    for (int c = uc; c <= 214; c++) {
        charset_lowercase[(unsigned char) c] = c - uc + lc;
    }
    for (int c = lc; c <= 246; c++) {
        charset_uppercase[(unsigned char) c] = c - lc + uc;
    }
    // Ø Ù Ú Û Ü Ý Þ
    // ø ù ú û ü ý þ
    uc = 216; lc = 248;
    for (int c = uc; c <= 222; c++) {
        charset_lowercase[(unsigned char) c] = c - uc + lc;
    }
    for (int c = lc; c <= 254; c++) {
        charset_uppercase[(unsigned char) c] = c - lc + uc;
    }
    // Š š
    uc = 138; lc = 154;
    charset_lowercase[(unsigned char) uc] = lc;
    charset_uppercase[(unsigned char) lc] = uc;
    // Œ œ
    uc = 140; lc = 156;
    charset_lowercase[(unsigned char) uc] = lc;
    charset_uppercase[(unsigned char) lc] = uc;
    // Ž ž
    uc = 142; lc = 158;
    charset_lowercase[(unsigned char) uc] = lc;
    charset_uppercase[(unsigned char) lc] = uc;
    // Ÿ ÿ
    uc = 159; lc = 255;
    charset_lowercase[(unsigned char) uc] = lc;
    charset_uppercase[(unsigned char) lc] = uc;
#endif
}
